{
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  },
  "name": "",
  "signature": "sha256:68282380208d53bec835592c26ebf58ef4588aa48bb8a3cfb04724959f25e0b5"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": true,
     "input": [
      "import h2o\n",
      "import csv\n",
      "import time\n",
      "import numpy as np\n",
      "import matplotlib.pyplot as plt\n",
      "from h2o.estimators.glrm import H2OGeneralizedLowRankEstimator\n",
      "from h2o.estimators.deeplearning import H2ODeepLearningEstimator"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Initialise H2O before any frame is uploaded or any model is trained in the cells below\n",
      "h2o.init()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from h2o.utils.shared_utils import _locate  # private function: finds files within the h2o git project directory\n",
      "\n",
      "# ACS 2013 5-year DP02 demographic data, parsed as one enum column followed by 149 numeric columns\n",
      "acs_path = _locate(\"bigdata/laptop/census/ACS_13_5YR_DP02_cleaned.zip\")\n",
      "acs_col_types = [\"enum\"] + [\"numeric\"] * 149\n",
      "acs_orig = h2o.upload_file(path=acs_path, col_types=acs_col_types)\n",
      "acs_orig.describe()\n",
      "\n",
      "# Keep the ZCTA5 identifier as its own factor column; acs_full is the frame without it\n",
      "acs_zcta_col = acs_orig[\"ZCTA5\"].asfactor()\n",
      "acs_full = acs_orig.drop(\"ZCTA5\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# WHD 2014-2015 labor violations data, parsed as 7 enum columns followed by 97 numeric columns\n",
      "whd_path = _locate(\"bigdata/laptop/census/whd_zcta_cleaned.zip\")\n",
      "whd_col_types = [\"enum\"] * 7 + [\"numeric\"] * 97\n",
      "whd_zcta = h2o.upload_file(path=whd_path, col_types=whd_col_types)\n",
      "\n",
      "# Store the ZCTA code back as a factor column, then summarise the frame\n",
      "zcta_factor = whd_zcta[\"zcta5_cd\"].asfactor()\n",
      "whd_zcta[\"zcta5_cd\"] = zcta_factor\n",
      "whd_zcta.describe()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Run GLRM to reduce ZCTA demographics to 10 archetypes (k = 10).\n",
      "# Columns are standardized first; the fit uses quadratic loss with a quadratic\n",
      "# regularizer on X (weight gamma_x) and an L1 regularizer on Y (weight gamma_y).\n",
      "acs_model = H2OGeneralizedLowRankEstimator(k = 10,\n",
      "                                           transform = \"STANDARDIZE\",\n",
      "                                           loss = \"Quadratic\",\n",
      "                                           regularization_x = \"Quadratic\",\n",
      "                                           regularization_y = \"L1\",\n",
      "                                           gamma_x = 0.25,\n",
      "                                           gamma_y = 0.5,\n",
      "                                           max_iterations = 100)\n",
      "acs_model.train(x = acs_full.names, training_frame= acs_full)\n",
      "\n",
      "# Parenthesised form prints the same text on the Python 2 kernel and stays valid on Python 3\n",
      "print(acs_model)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Objective function value per iteration, taken from the model's scoring history\n",
      "acs_model_score = acs_model.score_history()\n",
      "iterations = acs_model_score[\"iteration\"]\n",
      "objective = acs_model_score[\"objective\"]\n",
      "\n",
      "plt.plot(iterations, objective)\n",
      "plt.title(\"Objective Function Value per Iteration\")\n",
      "plt.xlabel(\"Iteration\")\n",
      "plt.ylabel(\"Objective\")\n",
      "plt.show()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Embedding of ZCTAs into archetypes (X): the model output stores the key of the\n",
      "# representation frame, which is then fetched from H2O by that key\n",
      "x_frame_key = acs_model._model_json[\"output\"][\"representation_name\"]\n",
      "zcta_arch_x = h2o.get_frame(x_frame_key)\n",
      "zcta_arch_x.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Plot a few ZCTAs on the first two archetypes.\n",
      "# Every ZCTA is looked up on its own, so each point is annotated with the name\n",
      "# paired with it here whatever the row order of the H2O frame is, and a ZCTA\n",
      "# that is missing from the data is skipped instead of shifting the other labels.\n",
      "# NOTE(review): 11219 looks like a Brooklyn ZIP code, not East Harlem - confirm the intended ZCTA/label\n",
      "zcta_labels = [(\"10065\", \"Upper East Side\"),   # Manhattan, NY (Upper East Side)\n",
      "               (\"11219\", \"East Harlem\"),       # Manhattan, NY (East Harlem)\n",
      "               (\"66753\", \"McCune\"),            # McCune, KS\n",
      "               (\"84104\", \"Salt Lake City\"),    # Salt Lake City, UT\n",
      "               (\"94086\", \"Sunnyvale\"),         # Sunnyvale, CA\n",
      "               (\"95014\", \"Cupertino\")]         # Cupertino, CA\n",
      "\n",
      "city_names = []\n",
      "city_points = []\n",
      "for zcta, name in zcta_labels:\n",
      "    # Coordinates of this ZCTA on the first two archetype columns\n",
      "    rows = np.array(h2o.as_list(zcta_arch_x[(acs_zcta_col == zcta), [0,1]]))\n",
      "    if len(rows) == 0:\n",
      "        continue\n",
      "    city_names.append(name)\n",
      "    city_points.append(rows[0])\n",
      "\n",
      "# reshape keeps a two-column array even when no ZCTA was found\n",
      "city_arch = np.array(city_points).reshape(-1, 2)\n",
      "\n",
      "plt.xlabel(\"First Archetype\")\n",
      "plt.ylabel(\"Second Archetype\")\n",
      "plt.title(\"Archetype Representation of Zip Code Tabulation Areas\")\n",
      "plt.plot(city_arch[:,0], city_arch[:,1], \"o\")\n",
      "\n",
      "# Label city names corresponding to ZCTAs\n",
      "for name, point in zip(city_names, city_arch):\n",
      "    plt.annotate(name, (point[0], point[1]))\n",
      "plt.show()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Archetypes to full feature mapping (Y), read from the model output\n",
      "arch_feat_y = acs_model._model_json[\"output\"][\"archetypes\"]\n",
      "\n",
      "# Parenthesised form prints the same text on the Python 2 kernel and stays valid on Python 3\n",
      "print(arch_feat_y)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Random test/train split of the WHD data with 20/80 ratio, from one uniform draw per row\n",
      "response = \"flsa_repeat_violator\"\n",
      "split = whd_zcta[response].runif()\n",
      "train = whd_zcta[split <= 0.8]\n",
      "test = whd_zcta[split > 0.8]\n",
      "\n",
      "# Build a DL model to predict repeat violators; the timer covers setup and training\n",
      "orig_start = time.time()\n",
      "dl_orig = H2ODeepLearningEstimator(epochs=0.1, hidden=[50, 50, 50], distribution=\"multinomial\")\n",
      "\n",
      "# Predictors: drop the response from the column names, then skip the first four that remain\n",
      "idx_x = train.names\n",
      "idx_x.remove(response)\n",
      "idx_x = idx_x[4:]\n",
      "\n",
      "dl_orig.train(x=idx_x, y=response, training_frame=train, validation_frame=test)\n",
      "orig_elapsed = time.time() - orig_start"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Replace zcta5_cd column in WHD data with GLRM archetypes.\n",
      "# The archetype frame gets the ZCTA codes under the WHD column name before the merge;\n",
      "# all_x=True / all_y=False are passed so the merge is driven by the WHD side.\n",
      "join_key = \"zcta5_cd\"\n",
      "zcta_arch_x[join_key] = acs_zcta_col\n",
      "whd_arch = whd_zcta.merge(zcta_arch_x, all_x=True, all_y=False).drop(join_key)\n",
      "whd_arch.describe()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Split the archetype-augmented WHD data into test/train with the same 20/80 draws\n",
      "# NOTE(review): reusing `split` assumes whd_arch has one row per whd_zcta row - confirm the merge keeps that\n",
      "train_mod = whd_arch[split <= 0.8]\n",
      "test_mod = whd_arch[split > 0.8]\n",
      "\n",
      "# Predictors of the reduced model come from the merged frame (archetype columns in,\n",
      "# zcta5_cd out), selected with the same rule as for the original model.\n",
      "# NOTE(review): the [4:] offset presumes the merged frame still starts with the same four skipped columns\n",
      "idx_x_mod = list(train_mod.names)\n",
      "idx_x_mod.remove(\"flsa_repeat_violator\")\n",
      "idx_x_mod = idx_x_mod[4:]\n",
      "\n",
      "# Build a DL model on the reduced data to predict repeat violators and score\n",
      "s = time.time()\n",
      "dl_mod = H2ODeepLearningEstimator(epochs = 0.1, hidden = [50,50,50], distribution = \"multinomial\")\n",
      "\n",
      "# Train and validate on the archetype frames; fitting on train/test again would\n",
      "# only rebuild the original model and make the comparison meaningless\n",
      "dl_mod.train(x               =idx_x_mod,\n",
      "             y               =\"flsa_repeat_violator\",\n",
      "             training_frame  =train_mod,\n",
      "             validation_frame=test_mod)\n",
      "\n",
      "mod_elapsed = time.time() - s"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Model performance comparison: each model is scored on the frames it was built from\n",
      "train_ll_orig = dl_orig.model_performance(train    ).logloss()\n",
      "test_ll_orig  = dl_orig.model_performance(test     ).logloss()\n",
      "train_ll_mod  = dl_mod .model_performance(train_mod).logloss()\n",
      "test_ll_mod   = dl_mod .model_performance(test_mod ).logloss()\n",
      "\n",
      "# Print results in pretty HTML table\n",
      "header = [\"Metric\"   , \"Original\"    , \"Reduced\"    ]\n",
      "table = [\n",
      "         [\"Runtime\"  , orig_elapsed  , mod_elapsed  ],\n",
      "         [\"Train LogLoss\", train_ll_orig, train_ll_mod],\n",
      "         [\"Test LogLoss\" , test_ll_orig , test_ll_mod ],\n",
      "        ]\n",
      "h2o.display.H2ODisplay(table,header)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}